{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "843e90da-ad80-443e-815d-5eb854992dfd",
   "metadata": {},
   "source": [
    "# Massive Text Embedding Benchmark (MTEB) Leaderboard\n",
    "\n",
    "https://huggingface.co/spaces/mteb/leaderboard"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a60bcbb3-3e6a-4f85-8c17-a7a232429943",
   "metadata": {},
   "source": [
    "## gte"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8245250b-9777-4e7a-b136-af0322bf1d3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn.functional as F\n",
    "from torch import Tensor\n",
    "from transformers import AutoTokenizer, AutoModel\n",
    "\n",
    "input_texts = [\n",
    "    \"中国的首都是哪里\",\n",
    "    \"你喜欢去哪里旅游\",\n",
    "    \"北京\",\n",
    "    \"今天中午吃什么\"\n",
    "]\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"thenlper/gte-base-zh\")\n",
    "model = AutoModel.from_pretrained(\"thenlper/gte-base-zh\")\n",
    "\n",
    "# Tokenize the input texts\n",
    "batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')\n",
    "\n",
    "outputs = model(**batch_dict)\n",
    "embeddings = outputs.last_hidden_state[:, 0]\n",
    " \n",
    "# (Optionally) normalize embeddings\n",
    "embeddings = F.normalize(embeddings, p=2, dim=1)\n",
    "scores = (embeddings[:1] @ embeddings[1:].T) * 100\n",
    "print(scores.tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c68d52df-91f4-458a-a739-8777065105e4",
   "metadata": {},
   "source": [
    "## bge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "353abaf5-dd58-4ce4-a2c9-0252a45aa090",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "queries = ['query_1', 'query_2']\n",
    "passages = [\"样例文档-1\", \"样例文档-2\"]\n",
    "instruction = \"为这个句子生成表示以用于检索相关文章：\"\n",
    "\n",
    "model = SentenceTransformer('BAAI/bge-large-zh-v1.5')\n",
    "q_embeddings = model.encode([instruction+q for q in queries], normalize_embeddings=True)\n",
    "p_embeddings = model.encode(passages, normalize_embeddings=True)\n",
    "scores = q_embeddings @ p_embeddings.T"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a686e3a4-b18c-4959-9ec9-cb6247ea1f5c",
   "metadata": {},
   "source": [
    "## m3e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13ffcbd3-595d-4daf-a5f4-caa707a42679",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "model = SentenceTransformer('moka-ai/m3e-base')\n",
    "\n",
    "#Our sentences we like to encode\n",
    "sentences = [\n",
    "    '* Moka 此文本嵌入模型由 MokaAI 训练并开源，训练脚本使用 uniem',\n",
    "    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',\n",
    "    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算，异质文本检索等功能，未来还会支持代码检索，ALL in one'\n",
    "]\n",
    "\n",
    "#Sentences are encoded by calling model.encode()\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "#Print the embeddings\n",
    "for sentence, embedding in zip(sentences, embeddings):\n",
    "    print(\"Sentence:\", sentence)\n",
    "    print(\"Embedding:\", embedding)\n",
    "    print(\"\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdef97db-2cc5-471d-bc0e-34a908a733e6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
